# Train and test classifiers with (1) top-N POS features, (2) top-N word features, (3) top-N POS + tense-mood-voice (TMV) features, and (4) top-N TMV features - each ranked by Random Forest feature importance.

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import  RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, average_precision_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

import data_loader
import config

import pandas as pd
import numpy as np
import pickle
import csv
import os
import random
# Fix the random sources this script controls so that repeated runs are comparable.
seed_value= 42 # random seed of 42 for all experiments
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up. Assigning it here is
# only seen by subprocesses started later; it does not change str hashing (and therefore
# set iteration order) in this already-running process. Export it before launching Python
# if hash ordering must be reproducible.
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value) # seeds Python's built-in `random` module
np.random.seed(seed_value) # seeds NumPy's global random state


def get_POS_str_filtered(fname, top_n):
    """
    Build a space-separated string of the lower-cased POS tags of one book.

    Reads the tab-separated file <config.BOOK_PATH><fname>/<fname>.tokens and keeps
    only the tokens whose tag is among the first top_n entries of the module-level
    list top_pos_features. Tags appear in file order.
    """
    tokens_path = config.BOOK_PATH + fname + '/' + fname + '.tokens'
    tokens = pd.read_csv(tokens_path, delimiter='\t', quoting=csv.QUOTE_NONE)
    tokens.fillna("", inplace=True)  # missing cells become empty strings so .str works
    tokens['pos'] = tokens['pos'].str.lower()

    allowed_tags = top_pos_features[:top_n]
    is_allowed = tokens['pos'].isin(allowed_tags)
    kept_tags = tokens.loc[is_allowed, 'pos']
    return ' '.join(kept_tags.tolist())

def pos_unigrams_topn(train_x, test_x, top_n):
    """
    Vectorizes the input text using top-N part-of-speech unigrams.

    The vocabulary is learned on the training files only and then applied to
    the test files.

    Parameters
    ----------
    train_x: list of train filenames
    test_x: list of test filenames
    top_n: top N features to consider

    Returns
    -------
    X_train, X_test (sparse count matrices) and list of feature_names
    """
    vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', encoding='utf-8')

    train_sentences = [get_POS_str_filtered(x, top_n) for x in train_x]
    test_sentences = [get_POS_str_filtered(x, top_n) for x in test_x]
    X_train = vectorizer.fit_transform(train_sentences)
    X_test = vectorizer.transform(test_sentences)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back for older installations. The result is
    # converted to a plain list so callers keep receiving the same type.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return X_train, X_test, feature_names



def get_word_str_filtered(fname, top_n):
    """
    Build a space-separated string of the lower-cased words of one book.

    Reads the tab-separated file <config.BOOK_PATH><fname>/<fname>.tokens, lower-cases
    its 'originalWord' column and keeps only the words found among the first top_n
    entries of the module-level list top_word_features. Words appear in file order.
    """
    tokens_path = config.BOOK_PATH + fname + '/' + fname + '.tokens'
    tokens = pd.read_csv(tokens_path, delimiter='\t', quoting=csv.QUOTE_NONE)
    tokens.fillna("", inplace=True)  # missing cells become empty strings so .str works

    lowered_words = tokens['originalWord'].str.lower()
    allowed_words = top_word_features[:top_n]
    kept_words = lowered_words[lowered_words.isin(allowed_words)]
    return ' '.join(kept_words.tolist())

def word_unigrams_topn(train_x, test_x, top_n):
    """
    Vectorizes the input text using top-N word unigrams.

    The vocabulary is learned on the training files only and then applied to
    the test files.

    Parameters
    ----------
    train_x: list of train filenames
    test_x: list of test filenames
    top_n: top N features to consider

    Returns
    -------
    X_train, X_test (sparse count matrices) and list of feature_names
    """
    vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', encoding='utf-8')

    train_sentences = [get_word_str_filtered(x, top_n) for x in train_x]
    test_sentences = [get_word_str_filtered(x, top_n) for x in test_x]
    X_train = vectorizer.fit_transform(train_sentences)
    X_test = vectorizer.transform(test_sentences)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back for older installations. The result is
    # converted to a plain list so callers keep receiving the same type.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return X_train, X_test, feature_names


def get_tmv_vector(fname, feats_to_consider):
    """
    Look up the tense-mood-voice values of one file.

    For every name in feats_to_consider (order preserved) the value stored under
    TMV_FEATURES[fname][name] is collected; the resulting list is returned.
    """
    vector = []
    for feat_name in feats_to_consider:
        vector.append(TMV_FEATURES[fname][feat_name])
    return vector
    
def get_POS_str_feats(fname, pos_feats_to_consider):
    """
    Build a space-separated string of the lower-cased POS tags of one book.

    Reads the tab-separated file <config.BOOK_PATH><fname>/<fname>.tokens and keeps
    only the tokens whose tag is contained in pos_feats_to_consider. Tags appear in
    file order.
    """
    tokens_path = config.BOOK_PATH + fname + '/' + fname + '.tokens'
    tokens = pd.read_csv(tokens_path, delimiter='\t', quoting=csv.QUOTE_NONE)
    tokens.fillna("", inplace=True)  # missing cells become empty strings so .str works
    tokens['pos'] = tokens['pos'].str.lower()

    is_wanted = tokens['pos'].isin(pos_feats_to_consider)
    kept_tags = tokens.loc[is_wanted, 'pos']
    return ' '.join(kept_tags.tolist())

def pos_tmv_topn(train_x, test_x, top_n):
    """
    Vectorizes the input text using top-N part-of-speech unigrams & tense-mood-voice.

    The first top_n entries of the module-level ranking top_pos_tmv_features are
    split into TMV features (names listed in tmv_feature_names) and POS tags
    (everything else). TMV values are looked up per file; POS tags are counted
    with a CountVectorizer fitted on the training files only.

    Parameters
    ----------
    train_x: list of train filenames
    test_x: list of test filenames
    top_n: top N features to consider

    Returns
    -------
    X_train, X_test (dense numpy arrays, TMV columns first, then POS counts)
    and list of feature_names in column order
    """
    # Slicing copies the list, so the edits below never touch the global ranking.
    features_to_consider = top_pos_tmv_features[:top_n]

    # The ranking file calls this feature 'coherence'; the TMV lookup uses 'coh_seq'.
    if 'coherence' in features_to_consider:
        features_to_consider.remove('coherence')
        features_to_consider.append('coh_seq')

    # Split while keeping the ranking order. Iterating a set of strings (as an
    # intersection/difference would) yields an order that can change between
    # interpreter runs, which would shuffle the TMV columns fed to the classifier.
    known_tmv = set(tmv_feature_names)
    unique_feats = list(dict.fromkeys(features_to_consider))  # de-duplicate, keep order
    tmv_feats = [feat for feat in unique_feats if feat in known_tmv]
    pos_feats = [feat for feat in unique_feats if feat not in known_tmv]
    print("Features to consider: {} | TMV: {} | POS: {}".format(features_to_consider, tmv_feats, pos_feats))

    # TMV:
    tmv_train = [get_tmv_vector(x, tmv_feats) for x in train_x]
    tmv_test = [get_tmv_vector(x, tmv_feats) for x in test_x]

    if len(pos_feats) == 0:
        return np.array(tmv_train), np.array(tmv_test), tmv_feats

    # POS:
    vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', encoding='utf-8')
    train_sentences = [get_POS_str_feats(x, pos_feats) for x in train_x]
    test_sentences = [get_POS_str_feats(x, pos_feats) for x in test_x]
    pos_train = vectorizer.fit_transform(train_sentences)
    pos_test = vectorizer.transform(test_sentences)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        pos_feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        pos_feature_names = vectorizer.get_feature_names()

    # Combine:
    combined_train = np.hstack((tmv_train, pos_train.toarray()))
    combined_test = np.hstack((tmv_test, pos_test.toarray()))
    return combined_train, combined_test, tmv_feats + pos_feature_names


def tmv_topn(train_x, test_x, top_n):
    """
    Vectorizes the input files using top-N tense-mood-voice features.

    Parameters
    ----------
    train_x: list of train filenames
    test_x: list of test filenames
    top_n: top N features to consider

    Returns
    -------
    X_train, X_test (dense numpy arrays, one row per file) and list of feature_names
    """
    # The slice is a copy, so the module-level ranking stays untouched.
    selected_feats = top_tmv_features[:top_n]
    # The ranking file says 'coherence'; the TMV lookup key is 'coh_seq' (moved to the end).
    if 'coherence' in selected_feats:
        selected_feats.remove('coherence')
        selected_feats.append('coh_seq')
    print("TMV Features to consider: {}".format(selected_feats))

    train_rows = [get_tmv_vector(name, selected_feats) for name in train_x]
    test_rows = [get_tmv_vector(name, selected_feats) for name in test_x]
    return np.array(train_rows), np.array(test_rows), selected_feats


def run_with_topn(funct, top_n):
    """
    Cross-validate the module-level classifier `algo` on the top_n most important features.

    funct: vectorizing function called as funct(train_x=..., test_x=..., top_n=...)
           and returning (X_train, X_test, feature_names), e.g. pos_unigrams_topn,
           word_unigrams_topn, pos_tmv_topn or tmv_topn.
    top_n: number of top-ranked features handed to funct.

    Uses the module-level `fnames`, `Y` and `folds`.
    Returns the f1-score, precision, recall and accuracy (for the "POS" class where
    applicable), each averaged across the folds and rounded to 4 decimals.
    """
    metric_names = ('f1', 'precision', 'recall', 'accuracy')
    per_fold = {metric: [] for metric in metric_names}

    splitter = StratifiedKFold(n_splits=folds) # splits the data into stratified folds
    # Only the labels matter for stratification, so a dummy X of the right length is passed.
    fold_indices = splitter.split(X=np.zeros(len(Y)), y=Y)
    for split_no, (train_indices, test_indices) in enumerate(fold_indices, start=1):
        X_train, X_test, feat_names = funct(train_x=fnames[train_indices],
                                            test_x=fnames[test_indices],
                                            top_n=top_n)
        y_train, y_test = Y[train_indices], Y[test_indices]

        clf = algo.fit(X_train, y_train)
        preds = clf.predict(X_test)
        assert clf.classes_.tolist()[0] == 'NEG' # make sure that the class ordering is ['NEG' 'POS']

        # Compute classification metrics:
        per_fold['f1'].append(f1_score(y_test, preds, pos_label="POS"))
        per_fold['precision'].append(precision_score(y_test, preds, pos_label="POS"))
        per_fold['recall'].append(recall_score(y_test, preds, pos_label="POS"))
        per_fold['accuracy'].append(accuracy_score(y_test, preds))
        print("Split number: {} | Train: {} & {} | Test: {} & {} | Feature Names = {}".format(split_no, X_train.shape, y_train.shape, X_test.shape, y_test.shape, feat_names))

    # Compute mean of every metric over the folds:
    return tuple(round(np.array(per_fold[metric]).mean(), 4) for metric in metric_names)

if __name__ == '__main__':
    # Feature family to evaluate: 'pos' or 'pos_tmv' or 'word' or 'tmv'
    RUN_FOR = 'pos_tmv'
    # Note: some filenames have been changed for "run2"
    print("\n\nRunning For:", RUN_FOR)

    p_fname = '/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/pickles/tense_mood_voice_features_lite.pickle'
    print("\nLoading TMV features from:", p_fname)
    # NOTE: unpickling can execute arbitrary code - only load pickles you produced yourself.
    with open(p_fname, 'rb') as f:
        TMV_FEATURES = pickle.load(f) # created via pickle_features.py

    # Module-level settings read by the functions above.
    folds = 5
    path = '/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/'
    algo = RandomForestClassifier(n_estimators=500, max_depth=None, random_state=42) # the best pos-tmv parameters
    tmv_feature_names = ['temporality', 'temporal_order', 'setting', 'concreteness', 'saying', 'eventfulness', 'agenthood', 'agency', 'coh_seq', 'feltness']


    fnames, Y = data_loader.load_data(discard_genres=['OPINION'], remove_annotated_passages=True, remove_mispreds=True)
    print("Dataset:", len(fnames), len(Y), "|", Counter(Y))

    # Each branch sets the global ranking its vectorizer reads, plus the per-run settings:
    # the ranking to report, the vectorizer, the output file and the N values to sweep.
    if RUN_FOR == 'pos':
        top_pos_features = pd.read_csv(path+'feature-importance/pos-unigrams.csv')['feature_names'].tolist()
        ranked_features = top_pos_features
        vectorize = pos_unigrams_topn
        results_path = path+'results/pos_topn_results.txt'
        n_values = range(1, 30)

    elif RUN_FOR == 'word':
        top_word_features = pd.read_csv(path+'feature-importance/word-unigrams.csv')['feature_names'].tolist()
        ranked_features = top_word_features
        vectorize = word_unigrams_topn
        results_path = path+'results/word_topn_results.txt'
        n_values = range(1, 30)

    elif RUN_FOR == 'pos_tmv':
        top_pos_tmv_features = pd.read_csv(path+'feature-importance/run2_pos-tmv.csv')['feature_names'].tolist()
        ranked_features = top_pos_tmv_features
        vectorize = pos_tmv_topn
        results_path = path+'results/postmv_topn_results_data'+str(len(fnames))+'_part3.txt'
        n_values = range(30, 50)

    elif RUN_FOR == 'tmv':
        top_tmv_features = pd.read_csv(path+'feature-importance/tense-mood-voice.csv')['feature_names'].tolist()
        ranked_features = top_tmv_features
        vectorize = tmv_topn
        results_path = path+'results/tmv_topn_results.txt'
        n_values = range(1, 15)

    else:
        # Fail with a clear message instead of a NameError once the results file is needed.
        raise ValueError("Unknown RUN_FOR value: {!r} (expected 'pos', 'pos_tmv', 'word' or 'tmv')".format(RUN_FOR))

    # The context manager flushes and closes the file even if a fold raises midway.
    with open(results_path, 'w') as results_file:
        results_file.write("Top-N\tFeatures\tF1-score\tPrecision\tRecall\tAccuracy\n")
        for N in n_values:
            print("\n-------------\nN = ", N)
            f1, prec, rec, acc = run_with_topn(vectorize, N)
            row = [str(N), str(ranked_features[:N]), str(f1), str(prec), str(rec), str(acc)]
            results_file.write('\t'.join(row)+'\n')
